import pandas as pd
from _funcs import parallel_read_csv, get_chunks, read_csv_chunk

## Generate firm dictionary
## Zihao Li. 06/2024

# Base directory for all input and output files. main() builds paths by plain
# string concatenation ('cleandata/...', 'rawdata/...', 'temp/...'), so the
# trailing slash is required.
# NOTE(review): this name shadows the builtin dir(); renaming it would also
# require updating every use inside main().
dir = r'/Volumes/Zihao_SSD2/PatentsView/'

def main():
    """Build the firm dictionary and save it as a Stata file.

    Steps, in order:
      1. Load ``cleandata/g_patent_clean_final.csv`` (under ``dir``) with
         ``parallel_read_csv``.
      2. Keep rows whose ``patent_year`` is in 1981-2015 (inclusive) and
         whose ``assignee_organization`` is not missing.
      3. Inner-merge with ``rawdata/KPSS_2020_public.csv`` on ``patent_id``
         (the KPSS column ``patent_num`` is renamed first), so only patents
         present in both files remain.
      4. Count patents per ``assignee_id``, keeping the first
         ``assignee_organization`` seen for each id, and rank firms by
         patent count (rank 1 = most patents).
      5. Write the result to ``temp/firmdict.dta``.

    The trailing ``# (...)`` comments record the values printed in the
    original author's run.
    """
    print('Loading data...')
    df = parallel_read_csv(dir + 'cleandata/g_patent_clean_final.csv', file_type='csv')
    print(f'Shape of dataset is {df.shape}.')  # (7481690, 141)

    # Restrict to the sample years (both bounds inclusive) and to the three
    # columns used below, then drop patents without an organization name.
    in_sample = df['patent_year'].between(1981, 2015)
    df = df.loc[in_sample, ['patent_id', 'assignee_organization', 'assignee_id']]
    df = df.dropna(subset=['assignee_organization'])
    print(f'Shape of dataset (after dropping empty assignee_organization and limiting year range) is {df.shape}.')  # (4370221, 3)

    # Merge KPSS firms
    kpss = pd.read_csv(dir + 'rawdata/KPSS_2020_public.csv', low_memory=False)
    kpss = kpss.rename(columns={'patent_num': 'patent_id'})
    df = df.merge(kpss, how='inner', on='patent_id')
    print(f'Shape of dataset (after merging with KPSS) is {df.shape}.')  # (1813242, 9)

    print('Counting patent for each firm...')
    # NOTE: groupby drops rows whose assignee_id is missing (pandas default).
    df = df.groupby('assignee_id').agg(
        assignee_organization=('assignee_organization', 'first'),
        patcount=('patent_id', 'count'),
    ).reset_index()

    # Many firms share the same patcount. The default quicksort is not stable,
    # so the order (and hence the rank) of tied firms would be unspecified and
    # could vary across NumPy versions/platforms. A stable sort keeps tied
    # firms in the assignee_id order produced by the groupby above.
    df = df.sort_values(by='patcount', ascending=False, kind='mergesort').reset_index(drop=True)
    df['assignee_rank'] = range(1, len(df) + 1)
    print('Total number of firms', df.shape[0])  # 17576

    df.to_stata(dir + 'temp/firmdict.dta', write_index=False, version=118)


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


